Load GraphLab Create


In [1]:
import graphlab

Basic settings


In [2]:
# limit the number of worker processes to 8
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 8)


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1474350125.log
This non-commercial license of GraphLab Create for academic use is assigned to sudhanshu.shekhar.iitd@gmail.com and will expire on September 18, 2017.

In [3]:
#set canvas to open inline
graphlab.canvas.set_target('ipynb')

Load the Amazon baby data


In [4]:
products = graphlab.SFrame('amazon_baby.gl/')

In [5]:
products.head()


Out[5]:
name | review | rating
Planetwise Flannel Wipes | These flannel wipes are OK, but in my opinion ... | 3.0
Planetwise Wipe Pouch | it came early and was not disappointed. i love ... | 5.0
Annas Dream Full Quilt with 2 Shams ... | Very soft and comfortable and warmer than it ... | 5.0
Stop Pacifier Sucking without tears with ... | This is a product well worth the purchase. I ... | 5.0
Stop Pacifier Sucking without tears with ... | All of my kids have cried non-stop when I tried to ... | 5.0
Stop Pacifier Sucking without tears with ... | When the Binky Fairy came to our house, we didn't ... | 5.0
A Tale of Baby's Days with Peter Rabbit ... | Lovely book, it's bound tightly so you may no ... | 4.0
Baby Tracker® - Daily Childcare Journal, ... | Perfect for new parents. We were able to keep ... | 5.0
Baby Tracker® - Daily Childcare Journal, ... | A friend of mine pinned this product on Pinte ... | 5.0
Baby Tracker® - Daily Childcare Journal, ... | This has been an easy way for my nanny to record ... | 4.0
[10 rows x 3 columns]


In [6]:
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

Feature engineering


In [7]:
products['word_count'] = graphlab.text_analytics.count_words(products['review'])
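
For intuition, here is a minimal pure-Python sketch of what this bag-of-words step produces for a single review. It assumes simple lowercasing plus whitespace tokenization (punctuation stays attached to words), which is consistent with the dictionaries shown below; simple_word_count is a hypothetical helper for illustration, not part of GraphLab Create.

def simple_word_count(text):
    # count lowercased, whitespace-separated tokens
    counts = {}
    for token in text.lower().split():
        counts[token] = counts.get(token, 0) + 1
    return counts

print simple_word_count(products[0]['review'])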

In [8]:
products.head()


Out[8]:
name | review | rating | word_count
Planetwise Flannel Wipes | These flannel wipes are OK, but in my opinion ... | 3.0 | {'and': 5, '6': 1, 'stink': 1, 'because' ...
Planetwise Wipe Pouch | it came early and was not disappointed. i love ... | 5.0 | {'and': 3, 'love': 1, 'it': 2, 'highly': 1, ...
Annas Dream Full Quilt with 2 Shams ... | Very soft and comfortable and warmer than it ... | 5.0 | {'and': 2, 'quilt': 1, 'it': 1, 'comfortable': ...
Stop Pacifier Sucking without tears with ... | This is a product well worth the purchase. I ... | 5.0 | {'ingenious': 1, 'and': 3, 'love': 2, ...
Stop Pacifier Sucking without tears with ... | All of my kids have cried non-stop when I tried to ... | 5.0 | {'and': 2, 'parents!!': 1, 'all': 2, 'puppet.': ...
Stop Pacifier Sucking without tears with ... | When the Binky Fairy came to our house, we didn't ... | 5.0 | {'and': 2, 'this': 2, 'her': 1, 'help': 2, ...
A Tale of Baby's Days with Peter Rabbit ... | Lovely book, it's bound tightly so you may no ... | 4.0 | {'shop': 1, 'noble': 1, 'is': 1, 'it': 1, 'as': ...
Baby Tracker® - Daily Childcare Journal, ... | Perfect for new parents. We were able to keep ... | 5.0 | {'and': 2, 'all': 1, 'right': 1, 'when': 1, ...
Baby Tracker® - Daily Childcare Journal, ... | A friend of mine pinned this product on Pinte ... | 5.0 | {'and': 1, 'help': 1, 'give': 1, 'is': 1, ' ...
Baby Tracker® - Daily Childcare Journal, ... | This has been an easy way for my nanny to record ... | 4.0 | {'journal.': 1, 'nanny': 1, 'standarad': 1, ...
[10 rows x 4 columns]

1. Use .apply() to build a new feature with the counts for each of the selected_words


In [9]:
# initialize a column of zeros for each selected word (filled with the actual counts below)
for key in selected_words:
    products[key] = 0

In [10]:
products.head()


Out[10]:
name | review | rating | word_count | awesome | great | fantastic
Planetwise Flannel Wipes | These flannel wipes are OK, but in my opinion ... | 3.0 | {'and': 5, '6': 1, 'stink': 1, 'because' ... | 0 | 0 | 0
Planetwise Wipe Pouch | it came early and was not disappointed. i love ... | 5.0 | {'and': 3, 'love': 1, 'it': 2, 'highly': 1, ... | 0 | 0 | 0
Annas Dream Full Quilt with 2 Shams ... | Very soft and comfortable and warmer than it ... | 5.0 | {'and': 2, 'quilt': 1, 'it': 1, 'comfortable': ... | 0 | 0 | 0
Stop Pacifier Sucking without tears with ... | This is a product well worth the purchase. I ... | 5.0 | {'ingenious': 1, 'and': 3, 'love': 2, ... | 0 | 0 | 0
Stop Pacifier Sucking without tears with ... | All of my kids have cried non-stop when I tried to ... | 5.0 | {'and': 2, 'parents!!': 1, 'all': 2, 'puppet.': ... | 0 | 0 | 0
Stop Pacifier Sucking without tears with ... | When the Binky Fairy came to our house, we didn't ... | 5.0 | {'and': 2, 'this': 2, 'her': 1, 'help': 2, ... | 0 | 0 | 0
A Tale of Baby's Days with Peter Rabbit ... | Lovely book, it's bound tightly so you may no ... | 4.0 | {'shop': 1, 'noble': 1, 'is': 1, 'it': 1, 'as': ... | 0 | 0 | 0
Baby Tracker® - Daily Childcare Journal, ... | Perfect for new parents. We were able to keep ... | 5.0 | {'and': 2, 'all': 1, 'right': 1, 'when': 1, ... | 0 | 0 | 0
Baby Tracker® - Daily Childcare Journal, ... | A friend of mine pinned this product on Pinte ... | 5.0 | {'and': 1, 'help': 1, 'give': 1, 'is': 1, ' ... | 0 | 0 | 0
Baby Tracker® - Daily Childcare Journal, ... | This has been an easy way for my nanny to record ... | 4.0 | {'journal.': 1, 'nanny': 1, 'standarad': 1, ... | 0 | 0 | 0
amazing | love | horrible | bad | terrible | awful | wow | hate
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
[10 rows x 15 columns]


In [11]:
products[0]['word_count'].keys()


Out[11]:
['and',
 '6',
 'stink',
 'because',
 'ordered',
 'just',
 'boyfor',
 'wipes-ocean',
 'wipes,',
 'replace',
 'not',
 'softer',
 'are',
 'have',
 'in',
 'need',
 'rough',
 'ok,',
 'issues',
 'seemed',
 'use',
 'blue-12',
 'vimse',
 'for',
 'no',
 'that',
 'larger,',
 'been',
 'to',
 'someimse',
 'quality.',
 '8',
 'flannel',
 'worth',
 'higher',
 'them',
 'get',
 'keeping.',
 'countwhich',
 'texture',
 'but',
 'cloth',
 'nicer,',
 'they',
 'hands',
 'fab',
 'now',
 'had',
 'a',
 'also',
 'about',
 'usingthirsties',
 'longer',
 'i',
 'my',
 'months',
 'wipes',
 'these',
 'while',
 'stripping',
 'faces',
 'handles.',
 'opinion',
 'starting',
 'pack']

In [12]:
def key_count(d, key):
    # return the count stored for `key` in dictionary `d`, or 0 if the key is absent
    if key in d:
        return d[key]
    else:
        return 0
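
An equivalent, slightly more compact sketch uses dict.get with a default of 0 (the default argument w=word pins each word at definition time); this is just an alternative to the loop in the next cell, not what the notebook actually runs:

for word in selected_words:
    products[word] = products['word_count'].apply(lambda counts, w=word: counts.get(w, 0))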

In [13]:
for key in selected_words:
    products[key] = products['word_count'].apply(lambda x: key_count(x, key))

In [14]:
products.head()


Out[14]:
name | review | rating | word_count | awesome | great | fantastic
Planetwise Flannel Wipes | These flannel wipes are OK, but in my opinion ... | 3.0 | {'and': 5, '6': 1, 'stink': 1, 'because' ... | 0 | 0 | 0
Planetwise Wipe Pouch | it came early and was not disappointed. i love ... | 5.0 | {'and': 3, 'love': 1, 'it': 2, 'highly': 1, ... | 0 | 0 | 0
Annas Dream Full Quilt with 2 Shams ... | Very soft and comfortable and warmer than it ... | 5.0 | {'and': 2, 'quilt': 1, 'it': 1, 'comfortable': ... | 0 | 0 | 0
Stop Pacifier Sucking without tears with ... | This is a product well worth the purchase. I ... | 5.0 | {'ingenious': 1, 'and': 3, 'love': 2, ... | 0 | 0 | 0
Stop Pacifier Sucking without tears with ... | All of my kids have cried non-stop when I tried to ... | 5.0 | {'and': 2, 'parents!!': 1, 'all': 2, 'puppet.': ... | 0 | 1 | 0
Stop Pacifier Sucking without tears with ... | When the Binky Fairy came to our house, we didn't ... | 5.0 | {'and': 2, 'this': 2, 'her': 1, 'help': 2, ... | 0 | 1 | 0
A Tale of Baby's Days with Peter Rabbit ... | Lovely book, it's bound tightly so you may no ... | 4.0 | {'shop': 1, 'noble': 1, 'is': 1, 'it': 1, 'as': ... | 0 | 0 | 0
Baby Tracker® - Daily Childcare Journal, ... | Perfect for new parents. We were able to keep ... | 5.0 | {'and': 2, 'all': 1, 'right': 1, 'when': 1, ... | 0 | 0 | 0
Baby Tracker® - Daily Childcare Journal, ... | A friend of mine pinned this product on Pinte ... | 5.0 | {'and': 1, 'help': 1, 'give': 1, 'is': 1, ' ... | 0 | 0 | 0
Baby Tracker® - Daily Childcare Journal, ... | This has been an easy way for my nanny to record ... | 4.0 | {'journal.': 1, 'nanny': 1, 'standarad': 1, ... | 0 | 0 | 0
amazing | love | horrible | bad | terrible | awful | wow | hate
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 2 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
[10 rows x 15 columns]


In [15]:
products.tail()


Out[15]:
name | review | rating | word_count | awesome | great
Airline Seat Belt Extender - The Best ... | Up until recently I have hated flying, putting on ... | 5.0 | {'and': 1, 'have': 1, 'being': 1, 'protrudes': ... | 0 | 0
Airline Seat Belt Extender - The Best ... | I bought this as a father's day gift for my ... | 5.0 | {'and': 4, 'dad': 1, 'because': 1, 'being' ... | 0 | 0
Squeasy Snacker 6oz Silicone Reusable Food ... | I love that these can hold purees OR liquids, ... | 5.0 | {'and': 5, 'smaller': 1, 'love': 2, 'clip': 1, ... | 0 | 1
Squeasy Snacker 6oz Silicone Reusable Food ... | I love this product, it makes my life easier. ... | 5.0 | {'and': 6, 'love': 1, 'school': 1, 'just': 1, ... | 0 | 1
Squeasy Snacker 6oz Silicone Reusable Food ... | This reusable squeeze bottle is the best I ... | 5.0 | {'-': 1, 'through': 1, 'go': 1, 'yet': 1, ... | 1 | 0
Baby Teething Necklace for Mom Pretty Donut ... | Such a great idea! very handy to have and look ... | 5.0 | {'and': 1, 'help': 1, 'too,': 1, 'reduce': 1, ... | 0 | 2
Baby Teething Necklace for Mom Pretty Donut ... | This product rocks! It is a great blend of ... | 5.0 | {'accessible': 1, 'and': 5, 'concept': 1, 'is' ... | 0 | 1
Abstract 2 PK Baby / Toddler Training Cup ... | This item looks great and cool for my kids....I ... | 5.0 | {'and': 2, 'great': 2, 'kids....i': 1, 'for' ... | 0 | 2
Baby Food Freezer Tray - Bacteria Resistant, BPA ... | I am extremely happy with this product. I have ... | 5.0 | {'just': 1, 'bimbi': 2, 'one.': 1, 'bright': 1, ... | 0 | 0
Best 2 Pack Baby Car Shade for Kids - Window ... | I love this product very mush . I have bought ... | 5.0 | {'and': 1, 'love': 1, 'keeps': 1, 'shades': 1, ... | 0 | 0
fantastic | amazing | love | horrible | bad | terrible | awful | wow | hate
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
[10 rows x 15 columns]


In [16]:
products['awesome'].show(view='Categorical')


Using the .sum() method on each of the new columns you created, answer the following questions

Out of the selected_words, which one is most used in the dataset? Which one is least used? Save these results to answer the quiz at the end.


In [18]:
for key in selected_words:
    print key + ' : ' + str(products[key].sum())


awesome : 2090
great : 45206
fantastic : 932
amazing : 1363
love : 42065
horrible : 734
bad : 3724
terrible : 748
awful : 383
wow : 144
hate : 1220
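
The most- and least-used words can also be picked out programmatically. A minimal sketch, assuming the sums printed above (word_totals is just an illustrative name):

word_totals = [(word, products[word].sum()) for word in selected_words]
print max(word_totals, key=lambda t: t[1])  # most used ('great' here)
print min(word_totals, key=lambda t: t[1])  # least used ('wow' here)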

In [19]:
len(selected_words)


Out[19]:
11

In [21]:
# ignore all 3* reviews
products = products[products['rating'] != 3]

In [22]:
# positive sentiment = 4* or 5* reviews
products['sentiment'] = products['rating'] >= 4

In [23]:
# recompute the selected-word counts on the filtered (no 3-star) dataset
for key in selected_words:
    products[key] = products['word_count'].apply(lambda x: key_count(x, key))

In [24]:
for key in selected_words:
    print key + ' : ' + str(products[key].sum())


awesome : 2002
great : 42420
fantastic : 873
amazing : 1305
love : 40277
horrible : 659
bad : 3197
terrible : 673
awful : 345
wow : 131
hate : 1057

2. Create a new sentiment analysis model using only the selected_words as features


In [25]:
train_data,test_data = products.random_split(.8, seed=0)
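
As a quick sanity check on the roughly 80/20 split, you can print the sizes of the two pieces (a minimal sketch; the training log below reports 133448 training examples, and the evaluation output later implies 33304 test rows):

print len(train_data), len(test_data)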

In [26]:
selected_words_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=selected_words,
                                                     validation_set=test_data)


Logistic regression:
--------------------------------------------------------
Number of examples          : 133448
Number of classes           : 2
Number of feature columns   : 11
Number of unpacked features : 11
Number of coefficients    : 12
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+--------------+-------------------+---------------------+
| 1         | 2        | 1.110246     | 0.844299          | 0.842842            |
| 2         | 3        | 1.176800     | 0.844186          | 0.842842            |
| 3         | 4        | 1.241676     | 0.844276          | 0.843142            |
| 4         | 5        | 1.306232     | 0.844269          | 0.843142            |
| 5         | 6        | 1.368675     | 0.844269          | 0.843142            |
| 6         | 7        | 1.449278     | 0.844269          | 0.843142            |
+-----------+----------+--------------+-------------------+---------------------+
SUCCESS: Optimal solution found.

Coefficients

Using this approach, sort the learned coefficients according to the ‘value’ column using .sort(). Out of the 11 words in selected_words, which one got the most positive weight? Which one got the most negative weight? Do these values make sense to you? Save these results to answer the quiz at the end.


In [29]:
selected_words_model['coefficients'].sort('value', ascending=False).print_rows(12,5)


+-------------+-------+-------+------------------+------------------+
|     name    | index | class |      value       |      stderr      |
+-------------+-------+-------+------------------+------------------+
|     love    |  None |   1   |  1.39989834302   | 0.0287147460124  |
| (intercept) |  None |   1   |  1.36728315229   | 0.00861805467824 |
|   awesome   |  None |   1   |  1.05800888878   |  0.110865296265  |
|   amazing   |  None |   1   |  0.892802422508  |  0.127989503231  |
|  fantastic  |  None |   1   |  0.891303090304  |  0.154532343591  |
|    great    |  None |   1   |  0.883937894898  | 0.0217379527921  |
|     wow     |  None |   1   | -0.0541450123333 |  0.275616449416  |
|     bad     |  None |   1   | -0.985827369929  | 0.0433603009142  |
|     hate    |  None |   1   |  -1.40916406276  | 0.0771983993506  |
|    awful    |  None |   1   |  -1.76469955631  |  0.134679803365  |
|   horrible  |  None |   1   |  -1.99651800559  | 0.0973584169028  |
|   terrible  |  None |   1   |  -2.09049998487  | 0.0967241912229  |
+-------------+-------+-------+------------------+------------------+
[12 rows x 5 columns]
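
The two extremes can also be read off programmatically. A minimal sketch that drops the intercept row and keeps only the word coefficients (most_positive and most_negative are illustrative names):

coefs = selected_words_model['coefficients']
word_coefs = coefs[coefs['name'] != '(intercept)']
most_positive = word_coefs.sort('value', ascending=False)[0]
most_negative = word_coefs.sort('value')[0]
print most_positive['name'], most_positive['value']  # 'love' here
print most_negative['name'], most_negative['value']  # 'terrible' here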

3. Comparing the accuracy of different sentiment analysis models


In [30]:
sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=['word_count'],
                                                     validation_set=test_data)


WARNING: The number of feature dimensions in this problem is very large in comparison with the number of examples. Unless an appropriate regularization value is set, this model may not provide accurate predictions for a validation/test set.
Logistic regression:
--------------------------------------------------------
Number of examples          : 133448
Number of classes           : 2
Number of feature columns   : 1
Number of unpacked features : 219217
Number of coefficients    : 219218
Starting L-BFGS
--------------------------------------------------------
+-----------+----------+-----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Step size | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+-----------+--------------+-------------------+---------------------+
| 1         | 5        | 0.000002  | 1.375268     | 0.841481          | 0.839989            |
| 2         | 9        | 3.000000  | 2.601635     | 0.947425          | 0.894877            |
| 3         | 10       | 3.000000  | 3.160441     | 0.923768          | 0.866232            |
| 4         | 11       | 3.000000  | 3.619839     | 0.971779          | 0.912743            |
| 5         | 12       | 3.000000  | 4.155964     | 0.975511          | 0.908900            |
| 6         | 13       | 3.000000  | 4.599294     | 0.899991          | 0.825967            |
| 10        | 18       | 1.000000  | 7.072463     | 0.988715          | 0.916256            |
+-----------+----------+-----------+--------------+-------------------+---------------------+
TERMINATED: Iteration limit reached.
This model may not be optimal. To improve it, consider increasing `max_iterations`.
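
Since the solver stopped at its iteration limit, one option (not used in this notebook) is to retrain with a larger max_iterations; a sketch, with sentiment_model_more_iters as a purely illustrative name:

sentiment_model_more_iters = graphlab.logistic_classifier.create(train_data,
                                                                 target='sentiment',
                                                                 features=['word_count'],
                                                                 validation_set=test_data,
                                                                 max_iterations=20)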

In [31]:
sentiment_model.evaluate(test_data, metric='roc_curve')


Out[31]:
{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+----------------+----------------+-------+------+
 | threshold |      fpr       |      tpr       |   p   |  n   |
 +-----------+----------------+----------------+-------+------+
 |    0.0    |      1.0       |      1.0       | 27976 | 5328 |
 |   1e-05   | 0.909346846847 | 0.998856162425 | 27976 | 5328 |
 |   2e-05   | 0.896021021021 | 0.998748927652 | 27976 | 5328 |
 |   3e-05   | 0.886448948949 | 0.998462968259 | 27976 | 5328 |
 |   4e-05   | 0.879692192192 | 0.998284243637 | 27976 | 5328 |
 |   5e-05   | 0.875187687688 | 0.998212753789 | 27976 | 5328 |
 |   6e-05   | 0.872184684685 | 0.998177008865 | 27976 | 5328 |
 |   7e-05   | 0.868618618619 | 0.998034029168 | 27976 | 5328 |
 |   8e-05   | 0.864677177177 | 0.997998284244 | 27976 | 5328 |
 |   9e-05   | 0.860735735736 | 0.997962539319 | 27976 | 5328 |
 +-----------+----------------+----------------+-------+------+
 [100001 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}

In [32]:
sentiment_model.show(view='Evaluation')



In [33]:
sentiment_model.evaluate(test_data)


Out[33]:
{'accuracy': 0.916256305548883,
 'auc': 0.9446492867438502,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        1        |  1328 |
 |      0       |        0        |  4000 |
 |      1       |        1        | 26515 |
 |      1       |        0        |  1461 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.9500349343413533,
 'log_loss': 0.2610669843242271,
 'precision': 0.9523039902309378,
 'recall': 0.9477766657134686,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+----------------+----------------+-------+------+
 | threshold |      fpr       |      tpr       |   p   |  n   |
 +-----------+----------------+----------------+-------+------+
 |    0.0    |      1.0       |      1.0       | 27976 | 5328 |
 |   1e-05   | 0.909346846847 | 0.998856162425 | 27976 | 5328 |
 |   2e-05   | 0.896021021021 | 0.998748927652 | 27976 | 5328 |
 |   3e-05   | 0.886448948949 | 0.998462968259 | 27976 | 5328 |
 |   4e-05   | 0.879692192192 | 0.998284243637 | 27976 | 5328 |
 |   5e-05   | 0.875187687688 | 0.998212753789 | 27976 | 5328 |
 |   6e-05   | 0.872184684685 | 0.998177008865 | 27976 | 5328 |
 |   7e-05   | 0.868618618619 | 0.998034029168 | 27976 | 5328 |
 |   8e-05   | 0.864677177177 | 0.997998284244 | 27976 | 5328 |
 |   9e-05   | 0.860735735736 | 0.997962539319 | 27976 | 5328 |
 +-----------+----------------+----------------+-------+------+
 [100001 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}
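
As a cross-check, the reported accuracy follows from the confusion matrix above: (26515 + 4000) / 33304 ≈ 0.9163. A minimal sketch of the same computation (results and cm are illustrative names):

results = sentiment_model.evaluate(test_data)
cm = results['confusion_matrix']
correct = cm[cm['target_label'] == cm['predicted_label']]['count'].sum()
print float(correct) / cm['count'].sum()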

In [34]:
selected_words_model.evaluate(test_data)


Out[34]:
{'accuracy': 0.8431419649291376,
 'auc': 0.6648096413721418,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        0        |  234  |
 |      0       |        1        |  5094 |
 |      1       |        1        | 27846 |
 |      1       |        0        |  130  |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.914242563530107,
 'log_loss': 0.4054747110366022,
 'precision': 0.8453551912568306,
 'recall': 0.9953531598513011,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+-----+-----+-------+------+
 | threshold | fpr | tpr |   p   |  n   |
 +-----------+-----+-----+-------+------+
 |    0.0    | 1.0 | 1.0 | 27976 | 5328 |
 |   1e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   2e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   3e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   4e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   5e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   6e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   7e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   8e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   9e-05   | 1.0 | 1.0 | 27976 | 5328 |
 +-----------+-----+-----+-------+------+
 [100001 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}
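
The same check applies to the selected_words_model: (234 + 27846) / 33304 ≈ 0.8431 matches the reported accuracy, and note that only 234 of the 5328 negative reviews in test_data are classified correctly.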

In [35]:
selected_words_model.show(view='Evaluation')


What is the accuracy of the selected_words_model on the test_data? What was the accuracy of the sentiment_model that we learned using all the word counts in the lecture IPython Notebook above? What is the accuracy of the majority class classifier on this task? How do the different learned models compare with the baseline approach of simply predicting the majority class? Save these results to answer the quiz at the end.
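
For the majority-class baseline, the positive class dominates test_data; from the counts shown in the ROC output above, 27976 / (27976 + 5328) ≈ 0.84. A minimal sketch of the same calculation (the Canvas view in the next cell shows the same class balance):

num_positive = (test_data['sentiment'] == 1).sum()
print float(num_positive) / len(test_data)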


In [36]:
test_data['sentiment'].show(view='Categorical')


4. Interpreting the difference in performance between the models


In [42]:
diaper_champ_reviews = products[products['name'] == 'Baby Trend Diaper Champ']

In [43]:
len(diaper_champ_reviews)


Out[43]:
298

In [44]:
diaper_champ_reviews.head()


Out[44]:
name | review | rating | word_count | awesome | great | fantastic
Baby Trend Diaper Champ | Ok - newsflash. Diapers are just smelly. We've ... | 4.0 | {'son': 1, 'just': 2, 'less': 1, '-': 3, ... | 0 | 0 | 0
Baby Trend Diaper Champ | My husband and I selected the Diaper "Champ" ma ... | 1.0 | {'material)': 1, 'bags,': 1, 'less': 1, 'when': 3, ... | 0 | 0 | 0
Baby Trend Diaper Champ | Excellent diaper disposal unit. I used it in ... | 5.0 | {'control': 1, 'am': 1, 'it': 1, 'used': 1, ' ... | 0 | 0 | 0
Baby Trend Diaper Champ | We love our diaper champ. It is very easy to use ... | 5.0 | {'and': 3, 'over.': 1, 'all': 1, 'bags.': 1, ... | 0 | 0 | 0
Baby Trend Diaper Champ | Two girlfriends and two family members put me ... | 5.0 | {'just': 1, '-': 3, 'both': 1, 'results': 1, ... | 0 | 0 | 0
Baby Trend Diaper Champ | I waited to review this until I saw how it ... | 4.0 | {'lysol': 1, 'all': 1, 'mom.': 1, 'busy': 1, ... | 0 | 0 | 0
Baby Trend Diaper Champ | I have had a diaper genie for almost 4 years since ... | 1.0 | {'all': 1, 'bags.': 1, 'just': 1, "don't": 2, ... | 0 | 0 | 0
Baby Trend Diaper Champ | I originally put this item on my baby registry ... | 5.0 | {'lysol': 1, 'all': 2, 'bags.': 1, 'feedback': ... | 0 | 0 | 0
Baby Trend Diaper Champ | I am so glad I got the Diaper Champ instead of ... | 5.0 | {'and': 2, 'all': 1, 'just': 1, 'is': 2, ' ... | 0 | 0 | 0
Baby Trend Diaper Champ | We had 2 diaper Genie's both given to us as a ... | 4.0 | {'hand.': 1, 'both': 1, '(required': 1, 'befo ... | 0 | 0 | 0
amazing | love | horrible | bad | terrible | awful | wow | hate | sentiment
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1
0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1
[10 rows x 16 columns]


In [46]:
diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(diaper_champ_reviews, output_type='probability')

What is the ‘predicted_sentiment’ for the most positive review for ‘Baby Trend Diaper Champ’ according to the sentiment_model from the IPython Notebook from lecture?

Save this result to answer the quiz at the end.


In [47]:
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending=False)

In [48]:
diaper_champ_reviews.head()


Out[48]:
name | review | rating | word_count | awesome | great | fantastic
Baby Trend Diaper Champ | Baby Luke can turn a clean diaper to a dirty ... | 5.0 | {'all': 1, 'less': 1, "friend's": 1, '(which': ... | 0 | 0 | 0
Baby Trend Diaper Champ | I LOOOVE this diaper pail! Its the easies ... | 5.0 | {'just': 1, 'over': 1, 'rweek': 1, 'sooo': 1, ... | 0 | 0 | 0
Baby Trend Diaper Champ | We researched all of the different types of di ... | 4.0 | {'all': 2, 'just': 4, "don't": 2, 'one,': 1, ... | 0 | 0 | 0
Baby Trend Diaper Champ | My baby is now 8 months and the can has been ... | 5.0 | {"don't": 1, 'able': 2, 'over': 1, 'soon': 1, ... | 0 | 2 | 0
Baby Trend Diaper Champ | This is absolutely, by far, the best diaper ... | 5.0 | {'just': 3, 'money': 1, 'still': 3, 'fine': 1, ... | 0 | 0 | 0
Baby Trend Diaper Champ | Diaper Champ or Diaper Genie? That was my ... | 5.0 | {'son': 2, 'all': 1, 'bags.': 1, 'son,': 1, ... | 0 | 0 | 0
Baby Trend Diaper Champ | Wow! This is fabulous. It was a toss-up between ... | 5.0 | {'and': 4, 'this': 3, 'stink': 1, 'garbage' ... | 0 | 0 | 0
Baby Trend Diaper Champ | I originally put this item on my baby registry ... | 5.0 | {'lysol': 1, 'all': 2, 'bags.': 1, 'feedback': ... | 0 | 0 | 0
Baby Trend Diaper Champ | Two girlfriends and two family members put me ... | 5.0 | {'just': 1, '-': 3, 'both': 1, 'results': 1, ... | 0 | 0 | 0
Baby Trend Diaper Champ | I am one of those super-critical shoppers who ... | 5.0 | {'all': 1, 'humid': 1, 'just': 1, 'less': 1, ... | 0 | 0 | 0
amazing | love | horrible | bad | terrible | awful | wow | hate | sentiment | predicted_sentiment
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.999999937267
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.999999917406
0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0.999999899509
0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0.999999836182
0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.999999824745
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.999999759315
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.999999692111
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.999999642488
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0.999999604504
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.999999486804
[10 rows x 17 columns]

Now use the selected_words_model you learned using just the selected_words to predict the sentiment of the most positive review you found above. Save this result to answer the quiz at the end.


In [49]:
selected_words_model.predict(diaper_champ_reviews[0:1], output_type='probability')


Out[49]:
dtype: float
Rows: 1
[0.796940851290673]
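
This value is exactly what the selected_words_model produces when none of the selected words occur in a review: the prediction reduces to the sigmoid of the intercept, 1 / (1 + exp(-1.36728)) ≈ 0.797 (see the coefficient table above and the word counts inspected below).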

Why is the predicted_sentiment for the most positive review found using the model with all word counts (sentiment_model) much more positive than the one using only the selected_words (selected_words_model)? Hint: examine the text of this review, the extracted word counts for all words, and the word counts for each of the selected_words, and you will see what each model used to make its prediction. Save this result to answer the quiz at the end.
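
Following the hint, a minimal sketch that pulls out the selected-word counts for this top review (they all turn out to be zero, so the selected_words_model has nothing to work with):

top_counts = diaper_champ_reviews[0]['word_count']
print dict((word, top_counts.get(word, 0)) for word in selected_words)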


In [50]:
diaper_champ_reviews[0]['review']


Out[50]:
'Baby Luke can turn a clean diaper to a dirty diaper in 3 seconds flat. The diaper champ turns the smelly diaper into "what diaper smell" in less time than that. I hesitated and wondered what I REALLY needed for the nursery. This is one of the best purchases we made. The champ, the baby bjorn, fluerville diaper bag, and graco pack and play bassinet all vie for the best baby purchase.Great product, easy to use, economical, effective, absolutly fabulous.UpdateI knew that I loved the champ, and useing the diaper genie at a friend\'s house REALLY reinforced that!! There is no comparison, the chanp is easy and smell free, the genie was difficult to use one handed (which is absolutly vital if you have a little one on a changing pad) and there was a deffinite odor eminating from the genieplus we found that the quick tie garbage bags where the ties are integrated into the bag work really well because there isn\'t any added bulk around the sealing edge of the champ.'

In [51]:
diaper_champ_reviews[0]['word_count']


Out[51]:
{'"what': 1,
 '(which': 1,
 '3': 1,
 'a': 6,
 'absolutly': 2,
 'added': 1,
 'all': 1,
 'and': 6,
 'any': 1,
 'are': 1,
 'around': 1,
 'at': 1,
 'baby': 3,
 'bag': 1,
 'bag,': 1,
 'bags': 1,
 'bassinet': 1,
 'because': 1,
 'best': 2,
 'bjorn,': 1,
 'bulk': 1,
 'can': 1,
 'champ': 1,
 'champ,': 2,
 'champ.': 1,
 'changing': 1,
 'chanp': 1,
 'clean': 1,
 'comparison,': 1,
 'deffinite': 1,
 'diaper': 7,
 'difficult': 1,
 'dirty': 1,
 'easy': 2,
 'economical,': 1,
 'edge': 1,
 'effective,': 1,
 'eminating': 1,
 'fabulous.updatei': 1,
 'flat.': 1,
 'fluerville': 1,
 'for': 2,
 'found': 1,
 'free,': 1,
 "friend's": 1,
 'from': 1,
 'garbage': 1,
 'genie': 2,
 'genieplus': 1,
 'graco': 1,
 'handed': 1,
 'have': 1,
 'hesitated': 1,
 'house': 1,
 'i': 3,
 'if': 1,
 'in': 2,
 'integrated': 1,
 'into': 2,
 'is': 4,
 "isn't": 1,
 'knew': 1,
 'less': 1,
 'little': 1,
 'loved': 1,
 'luke': 1,
 'made.': 1,
 'needed': 1,
 'no': 1,
 'nursery.': 1,
 'odor': 1,
 'of': 2,
 'on': 1,
 'one': 3,
 'pack': 1,
 'pad)': 1,
 'play': 1,
 'product,': 1,
 'purchase.great': 1,
 'purchases': 1,
 'quick': 1,
 'really': 3,
 'reinforced': 1,
 'sealing': 1,
 'seconds': 1,
 'smell': 1,
 'smell"': 1,
 'smelly': 1,
 'than': 1,
 'that': 2,
 'that!!': 1,
 'that.': 1,
 'the': 17,
 'there': 3,
 'this': 1,
 'tie': 1,
 'ties': 1,
 'time': 1,
 'to': 3,
 'turn': 1,
 'turns': 1,
 'use': 1,
 'use,': 1,
 'useing': 1,
 'vie': 1,
 'vital': 1,
 'was': 2,
 'we': 2,
 'well': 1,
 'what': 1,
 'where': 1,
 'wondered': 1,
 'work': 1,
 'you': 1}

In [52]:
diaper_champ_reviews[0]


Out[52]:
{'amazing': 0,
 'awesome': 0,
 'awful': 0,
 'bad': 0,
 'fantastic': 0,
 'great': 0,
 'hate': 0,
 'horrible': 0,
 'love': 0,
 'name': 'Baby Trend Diaper Champ',
 'predicted_sentiment': 0.9999999372669541,
 'rating': 5.0,
 'review': 'Baby Luke can turn a clean diaper to a dirty diaper in 3 seconds flat. The diaper champ turns the smelly diaper into "what diaper smell" in less time than that. I hesitated and wondered what I REALLY needed for the nursery. This is one of the best purchases we made. The champ, the baby bjorn, fluerville diaper bag, and graco pack and play bassinet all vie for the best baby purchase.Great product, easy to use, economical, effective, absolutly fabulous.UpdateI knew that I loved the champ, and useing the diaper genie at a friend\'s house REALLY reinforced that!! There is no comparison, the chanp is easy and smell free, the genie was difficult to use one handed (which is absolutly vital if you have a little one on a changing pad) and there was a deffinite odor eminating from the genieplus we found that the quick tie garbage bags where the ties are integrated into the bag work really well because there isn\'t any added bulk around the sealing edge of the champ.',
 'sentiment': 1,
 'terrible': 0,
 'word_count': {'"what': 1,
  '(which': 1,
  '3': 1,
  'a': 6,
  'absolutly': 2,
  'added': 1,
  'all': 1,
  'and': 6,
  'any': 1,
  'are': 1,
  'around': 1,
  'at': 1,
  'baby': 3,
  'bag': 1,
  'bag,': 1,
  'bags': 1,
  'bassinet': 1,
  'because': 1,
  'best': 2,
  'bjorn,': 1,
  'bulk': 1,
  'can': 1,
  'champ': 1,
  'champ,': 2,
  'champ.': 1,
  'changing': 1,
  'chanp': 1,
  'clean': 1,
  'comparison,': 1,
  'deffinite': 1,
  'diaper': 7,
  'difficult': 1,
  'dirty': 1,
  'easy': 2,
  'economical,': 1,
  'edge': 1,
  'effective,': 1,
  'eminating': 1,
  'fabulous.updatei': 1,
  'flat.': 1,
  'fluerville': 1,
  'for': 2,
  'found': 1,
  'free,': 1,
  "friend's": 1,
  'from': 1,
  'garbage': 1,
  'genie': 2,
  'genieplus': 1,
  'graco': 1,
  'handed': 1,
  'have': 1,
  'hesitated': 1,
  'house': 1,
  'i': 3,
  'if': 1,
  'in': 2,
  'integrated': 1,
  'into': 2,
  'is': 4,
  "isn't": 1,
  'knew': 1,
  'less': 1,
  'little': 1,
  'loved': 1,
  'luke': 1,
  'made.': 1,
  'needed': 1,
  'no': 1,
  'nursery.': 1,
  'odor': 1,
  'of': 2,
  'on': 1,
  'one': 3,
  'pack': 1,
  'pad)': 1,
  'play': 1,
  'product,': 1,
  'purchase.great': 1,
  'purchases': 1,
  'quick': 1,
  'really': 3,
  'reinforced': 1,
  'sealing': 1,
  'seconds': 1,
  'smell': 1,
  'smell"': 1,
  'smelly': 1,
  'than': 1,
  'that': 2,
  'that!!': 1,
  'that.': 1,
  'the': 17,
  'there': 3,
  'this': 1,
  'tie': 1,
  'ties': 1,
  'time': 1,
  'to': 3,
  'turn': 1,
  'turns': 1,
  'use': 1,
  'use,': 1,
  'useing': 1,
  'vie': 1,
  'vital': 1,
  'was': 2,
  'we': 2,
  'well': 1,
  'what': 1,
  'where': 1,
  'wondered': 1,
  'work': 1,
  'you': 1},
 'wow': 0}

That's all folks!


In [ ]: